---
title: "Replication_appendix"
author: "Sebastian Haunss, Priska Daphi, Jan Matti Dollbaum, Lidiya Hristova, Pál Susánszky, Elias Steinhilper"
date: '2025-02-18'
output: 
  html_document:
    theme: cerulean
    toc: yes
    toc_float:
      collapsed: true
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(ggplot2)
library(data.table)
library(kableExtra)
```
## Appendix B: Testing a simplified version of the form codebook


### Table B2

...

### Table B3

Load data

```{r}
# Sentence-level form predictions. The prediction code is coerced to numeric,
# code 99 is recoded to 0 and stored as `pred_form`; the raw `prediction`
# column is dropped afterwards.
protest_forms_predicted <-
  fread(file = "../data/AppendixB3_protest_sentences_formprediction.csv") %>%
  mutate(
    pred_form = as.numeric(as.character(prediction)),
    pred_form = if_else(pred_form %in% 99, 0, pred_form)
  ) %>%
  select(-prediction)

# Hand-coded forms; the columns `AN` and `form_nr` are used below
fgz_forms <- fread(file = "../data/AppendixB3_fgz_forms.csv")

```

Preparations

```{r}
# ids of the articles that have hand-coded forms
ans <- fgz_forms$AN %>% 
  unique()

# summarize forms per article: one row per article, its distinct hand-coded
# forms spread over the columns F1, F2, ... in order of appearance
fgz_forms_per_article <- fgz_forms %>% 
  select(fid=AN, form_nr) %>% 
  unique() %>% 
  group_by(fid) %>% 
  mutate(cid = paste0("F",row_number())) %>% 
  pivot_wider(values_from = form_nr, names_from = cid) %>% 
  ungroup() %>% 
  arrange(fid)

# predicted forms per article: one row per (article, predicted form) with the
# number of sentences carrying that prediction in `form_count`.
# Rows with form 0 are dropped. The row order within an article (first
# appearance in the data) is kept on purpose: it breaks ties when the best
# prediction per article is selected later.
predicted_forms <- protest_forms_predicted %>% 
  filter(fid %in% ans) %>% 
  select(fid, form_nr=pred_form) %>% 
  group_by(fid, form_nr) %>% 
  mutate(form_count = n()) %>% 
  slice(1) %>% 
  arrange(fid) %>% 
  filter(form_nr > 0)

# join predicted forms and hand-coded forms; `eval` is initialised with 0 and
# filled in by the matching step below, `n_found_forms` is the number of
# distinct predicted forms of the article
forms_comparison <- left_join(predicted_forms, fgz_forms_per_article, by="fid") %>% 
  mutate(eval = 0, .before="F1") %>% 
  group_by(fid) %>% 
  mutate(n_found_forms = n(), .before="F1") %>% 
  ungroup()

# The aggregation to article level is done further below, once `eval` has been
# filled in. Aggregating at this point would only see eval == 0 and the result
# would be overwritten anyway.
```

Checking whether the predicted forms match any of the actual forms in the respective article

```{r}

# Columns holding the hand-coded forms of the article (F1, F2, ...). They are
# selected by name so that the check neither fails nor silently ignores
# columns when the maximum number of forms per article is not exactly six.
actual_form_cols <- grep("^F[0-9]+$", names(forms_comparison), value = TRUE)
actual_forms <- as.matrix(
  as.data.frame(forms_comparison)[, actual_form_cols, drop = FALSE]
)

# eval = 1 if the predicted form equals any hand-coded form of the article,
# 0 otherwise. `form_nr` is recycled down the columns of the matrix, i.e. the
# comparison is made row by row; empty F columns (NA) never count as a match.
forms_comparison$eval <- as.numeric(
  rowSums(actual_forms == forms_comparison$form_nr, na.rm = TRUE) > 0
)

```

Aggregate to article level

```{r}

# Keep one predicted form per article: the matching prediction (eval == 1)
# that is backed by the most sentences. eval_sum is 0 for non-matching
# predictions, so if nothing matches the first prediction of the article
# (in the existing row order) is kept.
predicted_forms_per_article <- forms_comparison %>%
  select(fid, form_nr, form_count, eval, F1) %>%
  mutate(eval_sum = eval * form_count) %>%
  group_by(fid) %>%
  arrange(desc(eval_sum), .by_group = TRUE) %>%
  slice(1) %>%
  ungroup()
```

Function for evaluation

```{r}
# Per-class and averaged precision, recall and F1 of a single-label
# classification.
#
# actual, predicted: vectors of equal length with the true and the predicted
#   class labels. Pairs with a missing value on either side do not enter the
#   confusion matrix.
#
# Returns a data frame with the columns precision, recall and f1: one row per
#   class (sorted union of all labels), followed by the rows "macro"
#   (unweighted mean over the classes) and "weighted" (classes weighted by
#   their share of the actual instances).
#
# Undefined per-class scores (NaN, e.g. the precision of a class that is never
#   predicted, or f1 when precision + recall is 0) are left out of the macro
#   means and add nothing to the weighted sums.
f1_eval <- function(actual, predicted) {
  classes <- sort(union(actual, predicted))

  # confusion matrix: rows = actual class, columns = predicted class
  cm <- as.matrix(table(factor(actual, classes), factor(predicted, classes)))

  n <- sum(cm)              # number of instances
  correct <- diag(cm)       # correctly classified instances per class
  actual_n <- rowSums(cm)   # instances per actual class
  predicted_n <- colSums(cm) # predictions per class

  precision <- correct / predicted_n
  recall <- correct / actual_n
  f1 <- 2 * precision * recall / (precision + recall)

  # share of each class among the actual instances
  weights <- actual_n / n

  rbind(
    data.frame(precision, recall, f1),
    data.frame(precision = mean(precision, na.rm = TRUE),
               recall = mean(recall, na.rm = TRUE),
               f1 = mean(f1, na.rm = TRUE),
               row.names = "macro"),
    data.frame(precision = sum(precision * weights, na.rm = TRUE),
               recall = sum(recall * weights, na.rm = TRUE),
               f1 = sum(f1 * weights, na.rm = TRUE),
               row.names = "weighted")
  )
}
```

Function to map the detailed form codes to the simplified form categories used at article level

```{r}
# Map detailed form codes to the simplified categories 1-6; code 97 stays 97.
# Codes that are not listed below, and NA, are returned as NA.
simple <- function(x) {
  # simplified category (name) -> detailed codes that belong to it
  groups <- list(
    "1" = c(4, 7, 15),
    "2" = c(5, 8, 14, 20, 21, 22),
    "3" = c(3, 9, 11, 12),
    "4" = c(2, 13, 16, 17, 18, 19),
    "5" = 10,
    "6" = c(1, 6),
    "97" = 97
  )
  detailed <- unlist(groups, use.names = FALSE)
  simplified <- rep(as.numeric(names(groups)), lengths(groups))
  simplified[match(x, detailed)]
}
```

Perform aggregation

```{r}
# simplified category of the predicted form and of the first hand-coded form
predicted_forms_per_article$form_nr_simple <- simple(predicted_forms_per_article$form_nr)
predicted_forms_per_article$F1_simple <- simple(predicted_forms_per_article$F1)

# eval_simple = 1 if both fall into the same simplified category, otherwise
# the detailed-level result `eval` is kept. coalesce() turns an undefined
# comparison (one of the simplified codes is NA) into "no match", so that such
# rows fall back to `eval` instead of becoming NA.
predicted_forms_per_article$eval_simple <- ifelse(
  coalesce(
    predicted_forms_per_article$form_nr_simple == predicted_forms_per_article$F1_simple,
    FALSE
  ),
  1,
  predicted_forms_per_article$eval
)

```

Evaluation data set

```{r}
# Evaluation pairs per article: if the prediction counts as correct
# (eval_simple == 1) the actual label is set to the predicted one, otherwise
# to the simplified first hand-coded form.
eval_Table_B3 <- predicted_forms_per_article %>% 
  mutate(actual = case_when(eval_simple == 1 ~ form_nr_simple,
                            TRUE ~ F1_simple)) %>% 
  select(predicted = form_nr_simple, actual)

df_eval_Table_B3 <- f1_eval(eval_Table_B3$actual, eval_Table_B3$predicted)

# Number of actual instances per class, followed by the total for the "macro"
# and "weighted" rows. The counts use the same class set as f1_eval() (sorted
# union of actual and predicted), so that a class that is predicted but never
# actual gets n = 0 instead of breaking the column length.
classes_Table_B3 <- sort(union(eval_Table_B3$actual, eval_Table_B3$predicted))
n_actual_Table_B3 <- table(factor(eval_Table_B3$actual, levels = classes_Table_B3))
df_eval_Table_B3$n <- c(n_actual_Table_B3,
                        sum(n_actual_Table_B3),
                        sum(n_actual_Table_B3))

```

Export table

```{r}
# Write Table B3 to disk; the row names (class labels plus the "macro" and
# "weighted" rows) are kept as the first column
write.csv(x = df_eval_Table_B3,
          file = "../tables/Table_B3.csv",
          row.names = TRUE)
```


### Table C1: Direct comparison of human and machine performance against gold standard

Load data
```{r}
# Gold-standard sample (dominant/second claim and form per article)
gold_sample <- fread(file = "../data/AppendixC_manual_goldstandard.csv")

# Hand-coded claims and forms (article id in `AN`)
claims <- fread(file = "../data/AppendixC_claims.csv")
forms <- fread(file = "../data/AppendixC_forms.csv")

# Machine predictions of claims and forms for the gold-standard articles
machine_claims <- fread(file = "../data/AppendixC_manual_goldstandard_claims_predicted.csv")
machine_forms <- fread(file = "../data/AppendixC_manual_goldstandard_forms_predicted.csv")


```

Preparations for human annotations

```{r}
# Keep only the hand-coded claims and forms of articles in the gold sample
sample_claims <- claims %>%
  filter(AN %in% gold_sample$AN)
sample_forms <- forms %>%
  filter(AN %in% gold_sample$AN)

# Gold-standard labels as factors
gold_sample <- gold_sample %>%
  mutate(across(c(dominant_claim, dominant_form, second_claim, second_form),
                as.factor))

# Most frequent hand-coded claim per article
most_frequent_claims <- sample_claims %>%
  group_by(AN, claim_detail) %>%
  dplyr::summarise(freq = n()) %>%                 # occurrences of each claim within the article
  slice_max(freq, n = 1, with_ties = FALSE) %>%    # keep the top claim; ties resolved by taking the first row
  select(AN, main_annotated_claim = claim_detail)

# Most frequent hand-coded form per article
most_frequent_forms <- sample_forms %>%
  group_by(AN, form) %>%
  dplyr::summarise(freq = n()) %>%                 # occurrences of each form within the article
  slice_max(freq, n = 1, with_ties = FALSE) %>%    # keep the top form; ties resolved by taking the first row
  select(AN, main_annotated_form = form)

# Attach the most frequent hand-coded form and claim to the gold-standard data
gold_sample <- gold_sample %>%
  left_join(most_frequent_forms, by = "AN") %>%
  left_join(most_frequent_claims, by = "AN")

# Human annotation vs. gold standard, lenient version: the human label is kept
# if it equals the dominant or the second gold-standard label, otherwise the
# value is "Mismatch".
# NOTE(review): if neither comparison is TRUE and one of them is NA, the result
# is NA rather than "Mismatch" - confirm the compared columns contain no NA.
gold_sample <- gold_sample %>%
  mutate(
    match_either_dominant_or_second_claim_human = ifelse(
      dominant_claim == main_annotated_claim |
        second_claim == main_annotated_claim,
      main_annotated_claim,
      "Mismatch"
    ),
    match_either_dominant_or_second_form_human = ifelse(
      dominant_form == main_annotated_form |
        second_form == main_annotated_form,
      main_annotated_form,
      "Mismatch"
    )
  )
```

Preparations for machine annotations

```{r}
# Lookup tables between numeric codes and verbal labels, taken from the
# hand-coded data.
# NOTE(review): claims_verbal keeps one row per label, forms_verbal one row per
# numeric code; both are joined on the numeric code below - this assumes a
# one-to-one mapping between codes and labels.
claims_verbal <- claims %>%
  distinct(claim_detail, .keep_all = TRUE) %>%
  select(claim_detail, claim_detail_numeric)

forms_verbal <- forms %>%
  distinct(form_numeric, .keep_all = TRUE) %>%
  select(form_numeric, form)

# Add the verbal label to every predicted claim
machine_claims <- machine_claims %>%
  left_join(
    claims_verbal %>%
      select(prediction_claim_verbal = claim_detail,
             prediction_claim = claim_detail_numeric),
    by = "prediction_claim"
  )

# Add the verbal label to every predicted form
machine_forms <- machine_forms %>%
  rename(prediction_form = prediction) %>%
  left_join(
    forms_verbal %>%
      select(prediction_form = form_numeric,
             prediction_form_verbal = form),
    by = "prediction_form"
  )

# Most frequent predicted claim per article; predictions without a verbal
# label are ignored
most_frequent_machine_claims <- machine_claims %>%
  filter(!is.na(prediction_claim_verbal)) %>%
  group_by(AN, prediction_claim_verbal) %>%
  dplyr::summarise(freq = n()) %>%                 # occurrences of each predicted claim within the article
  slice_max(freq, n = 1, with_ties = FALSE) %>%    # keep the top claim; ties resolved by taking the first row
  select(AN, main_machine_claim = prediction_claim_verbal)

# Most frequent predicted form per article; predictions without a verbal
# label are ignored
most_frequent_machine_forms <- machine_forms %>%
  filter(!is.na(prediction_form_verbal)) %>%
  group_by(AN, prediction_form_verbal) %>%
  dplyr::summarise(freq = n()) %>%                 # occurrences of each predicted form within the article
  slice_max(freq, n = 1, with_ties = FALSE) %>%    # keep the top form; ties resolved by taking the first row
  select(AN, main_machine_form = prediction_form_verbal)

# Attach the most frequent predicted form and claim to the gold-standard data
gold_sample <- gold_sample %>%
  left_join(most_frequent_machine_forms, by = "AN") %>%
  left_join(most_frequent_machine_claims, by = "AN")

# Machine prediction vs. gold standard, lenient version: the predicted label is
# kept if it equals the dominant or the second gold-standard label, otherwise
# the value is "Mismatch".
# NOTE(review): if neither comparison is TRUE and one of them is NA, the result
# is NA rather than "Mismatch" - confirm the compared columns contain no NA.
gold_sample <- gold_sample %>%
  mutate(
    match_either_dominant_or_second_claim_machine = ifelse(
      dominant_claim == main_machine_claim |
        second_claim == main_machine_claim,
      main_machine_claim,
      "Mismatch"
    ),
    match_either_dominant_or_second_form_machine = ifelse(
      dominant_form == main_machine_form |
        second_form == main_machine_form,
      main_machine_form,
      "Mismatch"
    )
  )

```



## Table C1

Display the information used for Table C1: the last two values of each output are the macro and the weighted F1 scores.
```{r}
# Each tail(..., 2) prints the last two f1 values returned by f1_eval(),
# i.e. the "macro" and the "weighted" row.

### scenario A: the label must equal the dominant gold-standard label
# form: gold vs human
f1_forms_human <- f1_eval(gold_sample$dominant_form, gold_sample$main_annotated_form)
tail(f1_forms_human$f1, 2)
# form: gold vs machine
f1_forms_machine <- f1_eval(gold_sample$dominant_form, gold_sample$main_machine_form)
tail(f1_forms_machine$f1, 2)

# claim: gold vs human
f1_claims_human <- f1_eval(gold_sample$dominant_claim, gold_sample$main_annotated_claim)
tail(f1_claims_human$f1, 2)
# claim: gold vs machine
f1_claims_machine <- f1_eval(gold_sample$dominant_claim, gold_sample$main_machine_claim)
tail(f1_claims_machine$f1, 2)

### scenario B: the label may equal the dominant or the second gold-standard label
# The "actual" vector is the match_either_* column: the annotated/predicted
# label itself where it equals one of the two gold labels, "Mismatch" otherwise.
# The result objects of scenario A are overwritten.
# NOTE(review): "Mismatch" never occurs as a predicted label, so its f1 is
# undefined and f1_eval() leaves it out of the macro mean - confirm intended.
# form: gold vs human
f1_forms_human <- f1_eval(gold_sample$match_either_dominant_or_second_form_human, gold_sample$main_annotated_form)
tail(f1_forms_human$f1, 2)
# form: gold vs machine
f1_forms_machine <- f1_eval(gold_sample$match_either_dominant_or_second_form_machine, gold_sample$main_machine_form)
tail(f1_forms_machine$f1, 2)

# claim: gold vs human
f1_claims_human <- f1_eval(gold_sample$match_either_dominant_or_second_claim_human, gold_sample$main_annotated_claim)
tail(f1_claims_human$f1, 2)
# claim: gold vs machine
f1_claims_machine <- f1_eval(gold_sample$match_either_dominant_or_second_claim_machine, gold_sample$main_machine_claim)
tail(f1_claims_machine$f1, 2)
```





## Figure C1

```{r}
# Cross-tabulation of the gold-standard dominant form and the most frequent
# human-annotated form; .drop = FALSE keeps empty factor combinations (freq 0)
x <- gold_sample %>%
  group_by(dominant_form,
           main_annotated_form = as.factor(main_annotated_form),
           .drop = FALSE) %>%
  summarise(freq = n())

# Heatmap of the cross-tabulation; rows without a human-annotated form are
# left out.
# NOTE(review): the y axis is labelled "dominant form" here but "gold standard"
# in the other three panels - confirm this is intended.
x %>%
  filter(!is.na(main_annotated_form)) %>%
  ggplot(aes(x = main_annotated_form, y = dominant_form)) +
  geom_tile(aes(fill = freq)) +   # cell colour follows the frequency
  geom_text(aes(label = freq)) +  # frequency printed in each cell
  scale_fill_gradient2(mid = "white",
                       high = alpha("midnightblue", .8),
                       midpoint = 0) +
  scale_x_discrete(name = "human annotators") +
  scale_y_discrete(name = "dominant form") +
  labs(title = "Dominant form per article") +
  theme(
    # no grid lines, white background, no legend
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.background = element_rect(fill = "white"),
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
    plot.title = element_text(size = 12, face = "bold")
  )

ggsave(filename = "../figures/FigureC1a.pdf", width = 8, height = 5)
ggsave(filename = "../figures/FigureC1a.png", width = 8, height = 5, dpi = 1200)


```
```{r}
# Cross-tabulation of the gold-standard dominant form and the most frequent
# machine-predicted form; .drop = FALSE keeps empty factor combinations (freq 0)
y <- gold_sample %>%
  group_by(dominant_form,
           main_machine_form = as.factor(main_machine_form),
           .drop = FALSE) %>%
  summarise(freq = n())

# Heatmap of the cross-tabulation; rows without a machine-predicted form are
# left out
y %>%
  filter(!is.na(main_machine_form)) %>%
  ggplot(aes(x = main_machine_form, y = dominant_form)) +
  geom_tile(aes(fill = freq)) +   # cell colour follows the frequency
  geom_text(aes(label = freq)) +  # frequency printed in each cell
  scale_fill_gradient2(mid = "white",
                       high = alpha("midnightblue", .8),
                       midpoint = 0) +
  scale_x_discrete(name = "machine") +
  scale_y_discrete(name = "gold standard") +
  labs(title = "Dominant form per article") +
  theme(
    # no grid lines, white background, no legend
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.background = element_rect(fill = "white"),
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
    plot.title = element_text(size = 12, face = "bold")
  )

ggsave(filename = "../figures/FigureC1b.pdf", width = 8, height = 5)
ggsave(filename = "../figures/FigureC1b.png", width = 8, height = 5, dpi = 1200)

```



## Figure C2

```{r}
# Cross-tabulation of the gold-standard dominant claim and the most frequent
# human-annotated claim; .drop = FALSE keeps empty factor combinations (freq 0)
y <- gold_sample %>%
  group_by(dominant_claim,
           main_annotated_claim = as.factor(main_annotated_claim),
           .drop = FALSE) %>%
  summarise(freq = n())

# Heatmap of the cross-tabulation; rows without a human-annotated claim are
# left out
y %>%
  filter(!is.na(main_annotated_claim)) %>%
  ggplot(aes(x = main_annotated_claim, y = dominant_claim)) +
  geom_tile(aes(fill = freq)) +   # cell colour follows the frequency
  geom_text(aes(label = freq)) +  # frequency printed in each cell
  scale_fill_gradient2(mid = "white",
                       high = alpha("midnightblue", .8),
                       midpoint = 0) +
  scale_x_discrete(name = "human annotators") +
  scale_y_discrete(name = "gold standard") +
  labs(title = "Dominant claim per article") +
  theme(
    # no grid lines, white background, no legend
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.background = element_rect(fill = "white"),
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
    plot.title = element_text(size = 12, face = "bold")
  )

ggsave(filename = "../figures/FigureC2a.pdf", width = 8, height = 5)
ggsave(filename = "../figures/FigureC2a.png", width = 8, height = 5, dpi = 1200)


```
```{r}
# Cross-tabulation of the gold-standard dominant claim and the most frequent
# machine-predicted claim; .drop = FALSE keeps empty factor combinations (freq 0)
y <- gold_sample %>%
  group_by(dominant_claim,
           main_machine_claim = as.factor(main_machine_claim),
           .drop = FALSE) %>%
  summarise(freq = n())

# Heatmap of the cross-tabulation; rows without a machine-predicted claim are
# left out
y %>%
  filter(!is.na(main_machine_claim)) %>%
  ggplot(aes(x = main_machine_claim, y = dominant_claim)) +
  geom_tile(aes(fill = freq)) +   # cell colour follows the frequency
  geom_text(aes(label = freq)) +  # frequency printed in each cell
  scale_fill_gradient2(mid = "white",
                       high = alpha("midnightblue", .8),
                       midpoint = 0) +
  scale_x_discrete(name = "machine") +
  scale_y_discrete(name = "gold standard") +
  labs(title = "Dominant claim per article") +
  theme(
    # no grid lines, white background, no legend
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.background = element_rect(fill = "white"),
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
    plot.title = element_text(size = 12, face = "bold")
  )

ggsave(filename = "../figures/FigureC2b.pdf", width = 8, height = 5)
ggsave(filename = "../figures/FigureC2b.png", width = 8, height = 5, dpi = 1200)

```

## D: Test performance on PolDem data

```{r}
### Protest forms

# sentence-level form predictions for the PolDem articles
poldem_forms <- read_delim("../data/PolDem_forms_predicted.csv")

# PolDem reference data (`action_form` per row, document id in `id_doc`)
poldem_data  <- read_delim("../data/PolDem_data.csv")

table(poldem_data$action_form)

# map the predicted form codes to verbal form categories; codes that are not
# listed yield NA
# NOTE(review): the category labels are assumed to be spelled exactly like the
# values of poldem_data$action_form, since they are compared below - confirm
poldem_forms <- poldem_forms %>% 
  mutate(pform = case_when(prediction == 0 ~ "None",
                           prediction %in% c(2, 3, 9, 11, 19) ~ "confrontations, blockades",
                           prediction %in% c(4, 12) ~ "demonstrations",
                           prediction %in% c(5,7,8,14,20,21,22) ~ "petitions, symbolic actions",
                           prediction == 10 ~ "strikes",
                           prediction %in% c(13,15,16,17,18) ~ "violent protest",
                           prediction %in% c(1,6,97) ~ "other protest"
  ))


# number the action forms within each document (F1, F2, ... in row order)
poldem_reduced_forms <- poldem_data %>% 
  select(fid = id_doc, doc_source, country_name, action_form, pdate = doc_publdate) %>% 
  mutate(pdate = as.Date(pdate, format = "%m/%d/%y")) %>% 
  group_by(fid) %>% 
  mutate(cid = paste0("F",row_number())) %>% 
  arrange(fid, cid)

# one row per document with its action forms in the columns F1, F2, ...
poldem_reduced_forms <- poldem_reduced_forms %>% 
  group_by(fid) %>% 
  pivot_wider(id_cols = fid, values_from = action_form, names_from = cid)

# one row per (document, predicted form) with the number of sentences carrying
# that prediction in `form_count`
predicted_forms <- poldem_forms %>% 
  select(fid, pform) %>% 
  group_by(fid, pform) %>% 
  mutate(form_count = n()) %>% 
  slice(1) %>% 
  arrange(fid)


# join predicted forms and actual form data; `eval` is initialised with 0
form_comparison <- left_join(predicted_forms, poldem_reduced_forms, by="fid") %>% 
  mutate(eval = 0, .before="F1") %>% 
  group_by(fid) %>% 
  mutate(n_found_forms = n(), .before="F1") %>% 
  ungroup()

# check whether the predicted form matches any of the actual forms in the
# respective document. The F columns are selected by name, so the check works
# for any number of forms per document. `pform` is recycled down the columns
# of the matrix, i.e. compared row by row; missing values on either side never
# count as a match.
actual_form_cols <- grep("^F[0-9]+$", names(form_comparison), value = TRUE)
actual_forms <- as.matrix(
  as.data.frame(form_comparison)[, actual_form_cols, drop = FALSE]
)
form_comparison$eval <- as.numeric(
  rowSums(actual_forms == form_comparison$pform, na.rm = TRUE) > 0
)

# evaluation pairs per (document, predicted form): if the prediction matched,
# the actual label is the predicted one, otherwise the first actual form
eval1_form <- form_comparison %>% 
  mutate(actual = case_when(eval == 1 ~ pform,
                            .default = F1)) %>% 
  select(predicted = pform, actual)

# Per-class and macro-averaged precision, recall and F1 of a single-label
# classification (same computation as f1_eval(), without the "weighted" row).
#
# actual, predicted: vectors of equal length with the true and the predicted
#   class labels. Pairs with a missing value on either side do not enter the
#   confusion matrix.
#
# Returns a data frame with the columns precision, recall and f1: one row per
#   class (sorted union of all labels), followed by the row "macro"
#   (unweighted mean over the classes).
#
# Undefined per-class scores (NaN, e.g. the precision of a class that is never
#   predicted, or f1 when precision + recall is 0) are left out of the macro
#   means.
f1_eval_forms <- function(actual, predicted) {
  classes <- sort(union(actual, predicted))

  # confusion matrix: rows = actual class, columns = predicted class
  cm <- as.matrix(table(factor(actual, classes), factor(predicted, classes)))

  correct <- diag(cm)        # correctly classified instances per class
  actual_n <- rowSums(cm)    # instances per actual class
  predicted_n <- colSums(cm) # predictions per class

  precision <- correct / predicted_n
  recall <- correct / actual_n
  f1 <- 2 * precision * recall / (precision + recall)

  rbind(
    data.frame(precision, recall, f1),
    data.frame(precision = mean(precision, na.rm = TRUE),
               recall = mean(recall, na.rm = TRUE),
               f1 = mean(f1, na.rm = TRUE),
               row.names = "macro")
  )
}

# evaluation at the level of (document, predicted form)
df_eval1_form <- f1_eval_forms(eval1_form$actual, eval1_form$predicted)
# number of predictions per class (NA for the "macro" row). The counts use the
# same class set as f1_eval_forms() (sorted union of actual and predicted), so
# that a class that only occurs as actual label gets n = 0 instead of breaking
# the column length.
df_eval1_form$n <- c(table(factor(eval1_form$predicted, levels=sort(union(eval1_form$actual, eval1_form$predicted)))),NA)

# aggregate at article level: keep the matching prediction (eval == 1) backed
# by the most sentences; eval_sum is 0 for non-matching predictions, so if
# nothing matches the first prediction of the document is kept.
# This overwrites the object of the same name created for Table B3.
predicted_forms_per_article <- form_comparison %>% 
  select(fid, pform, form_count, eval, F1) %>% 
  mutate(eval_sum = eval * form_count) %>% 
  group_by(fid) %>% 
  arrange(desc(eval_sum), .by_group = TRUE) %>% 
  slice(1) %>% 
  ungroup()

# evaluation pairs per document: if the prediction matched, the actual label
# is the predicted one, otherwise the first actual form
eval3_form <- predicted_forms_per_article %>% 
  mutate(actual = case_when(eval == 1 ~ pform,
                            .default = F1)) %>% 
  select(predicted = pform, actual)

df_eval3_form <- f1_eval_forms(eval3_form$actual, eval3_form$predicted)
# number of predictions per class (NA for the "macro" row)
df_eval3_form$n <- c(table(factor(eval3_form$predicted, levels=sort(union(eval3_form$actual, eval3_form$predicted)))),NA)

kable(df_eval3_form)
```


```{r}
# all classes that occur as actual or predicted label
u <- sort(union(eval3_form$actual, eval3_form$predicted))

# confusion matrix: rows = actual, columns = predicted. A plain table is all
# that is needed here; the counts are the same as those stored in a
# caret::confusionMatrix object, without requiring the caret package.
cm <- table(factor(eval3_form$actual, u), factor(eval3_form$predicted, u))

# convert the confusion matrix to a long data frame (one row per cell)
hm <- as.data.frame(cm)
names(hm) <- c("Actual", "Predicted", "Freq")

# create confusion matrix with ggplot2
g <- ggplot(hm, aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile() + theme_bw() + coord_equal() +
  scale_fill_distiller(palette = "Blues", direction = 1) +
  guides(fill = "none") +
  geom_text(aes(label = Freq), color = "black", size = 6) +
  scale_y_discrete(limits=rev)

# save this plot explicitly instead of relying on the last plot created
ggsave("../figures/PolDemConfusionMatrix.png", plot = g)

g

```

